From http://pandas.pydata.org/pandas-docs/stable/10min.html and http://pandas.pydata.org/pandas-docs/stable/basics.html
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
s = pd.Series([1,3,5,np.nan,6,8])
s
Out[2]:
In [5]:
df = pd.DataFrame(np.random.randn(6,4), index=pd.date_range('20130101', periods=6)
, columns=list('ABCD'))
df
Out[5]:
In [8]:
df2 = pd.DataFrame({ 'A' : 1.,
'B' : pd.Timestamp('20130102'),
'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
'D' : np.array([3] * 4,dtype='int32'),
'E' : pd.Categorical(["test","train","test","train"]),
'F' : 'foo' })
df2
Out[8]:
In [25]:
wp = pd.Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
major_axis=pd.date_range('1/1/2000', periods=5),
minor_axis=['A', 'B', 'C', 'D'])
wp
Out[25]:
In [9]:
print df.dtypes
df2.dtypes
Out[9]:
In [10]:
df2.var
Out[10]:
In [11]:
df.T
Out[11]:
In [12]:
df.all
Out[12]:
In [13]:
df.columns
Out[13]:
In [27]:
df.index
Out[27]:
In [15]:
df[:2]
Out[15]:
In [16]:
df[2:]
Out[16]:
In [26]:
print s.values
print df.values
wp.values
Out[26]:
In [28]:
df.describe()
Out[28]:
In [29]:
s.describe()
Out[29]:
In [42]:
print df2.E
print df2['D']
df[['a','c']]
Out[42]:
In [44]:
df[0:3]
Out[44]:
In [45]:
df['20130102':'20130104']
Out[45]:
In [48]:
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df
Out[48]:
In [70]:
print df.loc[dates[0]]
print df.loc[dates[0:2]]
print df.loc[dates[-1]]
In [68]:
df.iloc[-1]
Out[68]:
In [84]:
df
Out[84]:
In [82]:
df[df.A > 0]
Out[82]:
In [83]:
df[df > 0]
Out[83]:
In [89]:
df2 = df.copy()
df2['E'] = ['one', 'one','two','three','four','three']
df2
Out[89]:
In [90]:
df2[df2['E'].isin(['two','four'])]
Out[90]:
In [62]:
df.loc[:,['A','B']]
Out[62]:
In [63]:
df.loc['20130102':'20130104',['A','B']]
Out[63]:
In [64]:
df.loc['20130102',['A','B']]
Out[64]:
In [71]:
df.iloc[3:5,0:2]
Out[71]:
In [73]:
df.iloc[[3,5],[0,2]]
Out[73]:
In [75]:
df
Out[75]:
In [76]:
df.iloc[1:3,:]
Out[76]:
In [77]:
df.iloc[:,1:3]
Out[77]:
In [80]:
print df.loc[dates[0],'A']
# Fast access:
print df.at[dates[0],'A']
print df.iloc[0,0]
# Fast access:
df.iat[0,0]
Out[80]:
In [21]:
df.columns = [x.lower() for x in df.columns]
df
Out[21]:
In [32]:
df.sort_index(axis=1, ascending=False)
Out[32]:
In [36]:
# FIX: DataFrame.sort(columns=...) was deprecated and later removed;
# sort_values(by=...) is the supported API. Columns were lower-cased in
# an earlier cell, hence the lower-case label 'b'.
df.sort_values(by='b')
Out[36]:
In [91]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
s1
Out[91]:
In [92]:
df['F'] = s1
df
Out[92]:
In [94]:
df.at[dates[0],'A'] = 0
df
Out[94]:
In [95]:
df.iat[0,1] = 0
df
Out[95]:
In [97]:
df.loc[:,'D'] = np.array([5] * len(df))
df
Out[97]:
In [98]:
df2 = df.copy()
df2[df2 > 0] = -df2
df2
Out[98]:
In [108]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1
Out[108]:
In [109]:
df1.loc[dates[0]:dates[1],'E'] = 1
df1
Out[109]:
In [110]:
df1.dropna(how='any')
Out[110]:
In [112]:
df1.fillna(value=5)
Out[112]:
In [113]:
pd.isnull(df1)
Out[113]:
In [117]:
df
Out[117]:
In [139]:
df.describe(percentiles=[.05, .25, .75, .95])
Out[139]:
In [114]:
df.mean()
Out[114]:
In [116]:
df.mean(1)
Out[116]:
In [118]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
s
Out[118]:
In [120]:
print df
df.sub(s, axis='index')
Out[120]:
In [125]:
a = np.array([[1,2,3], [4,5,6]])
a
Out[125]:
In [126]:
np.cumsum(a)
Out[126]:
In [127]:
np.cumsum(a,axis=0)
Out[127]:
In [128]:
np.cumsum(a,axis=1)
Out[128]:
In [130]:
print df
df.apply(np.cumsum)
Out[130]:
In [136]:
df.apply(lambda x: x.max())
Out[136]:
In [138]:
df.apply(lambda x: x.min())
Out[138]:
In [135]:
df.apply(lambda x: x.max() - x.min())
Out[135]:
In [154]:
# BUG FIX: numpy was imported under the alias `np`, so the bare name
# `numpy` is undefined here (NameError). Seed the global RNG so the
# value_counts demo below is reproducible.
np.random.seed(3141)
s = pd.Series(np.random.randint(0, 7, size=10))  # 10 ints in [0, 6]
s
Out[154]:
In [155]:
s.value_counts()
Out[155]:
In [158]:
print df
df.apply(lambda x: x.value_counts())
Out[158]:
In [160]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s
Out[160]:
In [165]:
s.str.lower()
Out[165]:
In [167]:
df2 = pd.DataFrame({ 'A' : 1.,
'B' : pd.Timestamp('20130102'),
'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
'D' : np.array([3] * 4,dtype='int32'),
'E' : pd.Categorical(["test","train","test","train"]),
'F' : 'foo' })
df2
Out[167]:
In [169]:
df2.E.str.lower()
Out[169]:
In [170]:
df = pd.DataFrame(np.random.randn(10, 4))
df
Out[170]:
In [172]:
pieces = [df[:3], df[3:7], df[7:]]
pieces
Out[172]:
In [173]:
pd.concat(pieces)
Out[173]:
In [175]:
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
left
Out[175]:
In [176]:
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
right
Out[176]:
In [177]:
pd.merge(left, right, on='key')
Out[177]:
In [178]:
pd.merge(right, left, on='key')
Out[178]:
In [179]:
df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
df
Out[179]:
In [180]:
s = df.iloc[3]
s
Out[180]:
In [182]:
df.append(s)
Out[182]:
In [181]:
df.append(s, ignore_index=True)
Out[181]:
Splitting the data into groups, Applying a function to each group, Combining the results into a data structure
In the apply step, we might wish to do one of the following:
Aggregation: computing a summary statistic (or statistics) about each group. Some examples:
Compute group sums or means
Compute group sizes / counts
Transformation: perform some group-specific computations and return a like-indexed object. Some examples:
Standardizing data (zscore) within group
Filling NAs within groups with a value derived from each group
Filtration: discard some groups, according to a group-wise computation that evaluates True or False. Some examples:
Discarding data that belongs to groups with only a few members
Filtering out data based on the group sum or mean
Some combination of the above: GroupBy will examine the results of the apply step and try to return a sensibly combined result if it doesn’t fit into either of the above two categories
In [189]:
# BUG FIX: only `np` is a bound alias for numpy; the bare name `numpy`
# raised NameError. Seed the RNG so the random columns C/D reproduce.
np.random.seed(3141)
df = pd.DataFrame({
    'A' : ['foo', 'bar', 'foo', 'bar',
           'foo', 'bar', 'foo', 'foo'],
    'B' : ['one', 'one', 'two', 'three',
           'two', 'two', 'one', 'three'],
    'C' : np.random.randn(8),  # numeric payload columns for the
    'D' : np.random.randn(8)}  # groupby aggregation examples
)
df
Out[189]:
In [206]:
print df.groupby('A').count()
print df.groupby('B').count()
print df.groupby('C').count()
print df.groupby('D').count()
print df.groupby(['A', 'B']).count()
In [217]:
def get_letter_type(letter):
    """Classify a single letter as 'vowel' or 'consonant' (case-insensitive)."""
    return 'vowel' if letter.lower() in 'aeiou' else 'consonant'
df.groupby(get_letter_type, axis=1).sum()
Out[217]:
In [218]:
df.groupby('A').groups
Out[218]:
In [219]:
df.groupby(get_letter_type, axis=1).groups
Out[219]:
In [220]:
df.groupby(['A', 'B']).groups
Out[220]:
In [221]:
len(df.groupby(['A', 'B']).groups)
Out[221]:
In [239]:
def printgroups(groups):
    """Print each group's key followed by the group's contents."""
    for key, frame in groups:
        print(key)
        print(frame)
def printgroups2(groups):
    """Like printgroups, but for two-level group keys: each half of the
    key tuple is printed on its own line, then the group's contents."""
    for (first_key, second_key), frame in groups:
        print(first_key)
        print(second_key)
        print(frame)
In [241]:
print df
printgroups(df.groupby(['A']))
printgroups(df.groupby(['B','A']))
printgroups2(df.groupby(['B','A']))
In [243]:
grouped = df.groupby('A')
grouped.get_group('bar')
Out[243]:
In [244]:
df.groupby(['A', 'B']).get_group(('bar', 'one'))
Out[244]:
In [249]:
df
Out[249]:
In [250]:
df.groupby('A').sum()
Out[250]:
In [251]:
df.groupby(['A','B']).sum()
Out[251]:
In [252]:
df.groupby(['B','A']).sum()
Out[252]:
In [253]:
grouped = df.groupby('A')
grouped.aggregate(np.sum)
Out[253]:
In [254]:
grouped = df.groupby(['A', 'B'], as_index=False)
grouped.aggregate(np.sum)
Out[254]:
In [256]:
df.groupby(['A', 'B']).sum().reset_index()
Out[256]:
In [255]:
df.groupby('A', as_index=False).sum()
Out[255]:
In [257]:
df.groupby(['A', 'B']).size()
Out[257]:
In [258]:
df.groupby(['A', 'B']).describe()
Out[258]:
In [270]:
grouped = df.groupby('A')
grouped.groups
Out[270]:
In [271]:
grouped.agg([np.sum, np.mean, np.std]).reset_index()
Out[271]:
In [272]:
grouped['C'].agg([np.sum, np.mean, np.std]).reset_index()
Out[272]:
In [275]:
grouped['D'].agg({'Sum' : np.sum, 'Mean' : np.mean}).reset_index()
Out[275]:
In [277]:
grouped.agg([np.sum, np.mean, np.std]).reset_index()
Out[277]:
In [281]:
grouped.agg({'C' : np.sum, 'D' : lambda x: np.std(x, ddof=1)}).reset_index()
Out[281]:
In [282]:
grouped.agg({'C' : 'sum', 'D' : 'std'}).reset_index()
Out[282]:
In [283]:
df.groupby('A').sum().reset_index()
Out[283]:
In [286]:
index = pd.date_range('10/1/1999', periods=1100)
# Daily series of N(0.5, 2) draws over ~3 years.
ts = pd.Series(np.random.normal(0.5, 2, 1100), index)
print(ts.head())
# FIX: pd.rolling_mean(ts, window, min_periods) was deprecated and later
# removed; the supported API is the .rolling(...).mean() method. With
# min_periods=100 the first 99 entries are NaN and dropna() removes them.
ts = ts.rolling(window=100, min_periods=100).mean().dropna()
print(ts.head())
In [290]:
ts.tail()
Out[290]:
In [295]:
key = lambda x: x.year
zscore = lambda x: (x - x.mean()) / x.std()
transformed = ts.groupby(key).transform(zscore)
In [296]:
grouped = ts.groupby(key)
grouped.mean()
Out[296]:
In [297]:
grouped.std()
Out[297]:
In [298]:
grouped_trans = transformed.groupby(key)
In [299]:
grouped_trans.mean()
Out[299]:
In [300]:
grouped_trans.std()
Out[300]:
In [301]:
compare = pd.DataFrame({'Original': ts, 'Transformed': transformed})
In [302]:
compare.plot()
Out[302]:
In [ ]: